Introduction

This HTML report details an exploration of the nature of circadian genes in melanoma cell lines from the Broad Institute Depmap datasets using the depmap R package, cancer dependency data described by Tsherniak, Aviad, et al. “Defining a cancer dependency map.” Cell 170.3 (2017): 564-576.. The DepMap data enables us to look at CRISPR essentiality, copy number, mutation calls, and transcript expression for a large panel of cancer cell lines, including melanoma.

Gene lists

We will interrogate Depmap data using a curated list of genes that regulate the human circadian rhythm, which were taken from the publication The Circadian Clock Component RORA Increases Immunosurveillance in Melanoma by Inhibiting PD-L1 Expression, Liu, et al. 2024.

circadian_gene_list <- c(
  "RORA", "PER3", "PER1", "CRY2", "RORB", "PER2", "NR1D2", "RORC", "CRY1",
  "ARNTL", "NPAS2", "CLOCK", "NR1D1", "ARNTL2")

# driver_gene_list <- c(
#   "BRAF", "NRAS", "NF1", "CDKN2A", "PTEN", "TP53", "TERT", "KIT", "GNAQ",
#   "GNA11", "MITF", "MC1R", "CDK4")

Biomart

Here, we connect to Ensembl via biomaRt (disabled by eval=FALSE so it doesn’t automatically run each time). This code retrieves gene metadata (e.g., gene names, coordinates) and saves it as an .rds file.

## Connect to the Ensembl database
ensembl <- useMart("ensembl", dataset = "hsapiens_gene_ensembl")

## get information for all human genes (to be used later)
getBM(
  attributes = c(
  "external_gene_name", "description", "entrezgene_id", "ensembl_gene_id",
  "chromosome_name", "start_position", "end_position"),
      mart = ensembl) %>%
  dplyr::rename(gene_name = external_gene_name,
                entrez_id = entrezgene_id,
                ensembl_id = ensembl_gene_id,
                chromosome = chromosome_name,
                start = start_position,
                end = end_position) %>%
  dplyr::filter(stringr::str_length(gene_name) > 1,
                stringr::str_length(chromosome) < 3) %>%
  dplyr::mutate(description = gsub("\\[Source:.*", "", description)) %>%
  as.data.frame() -> human_genes

saveRDS(human_genes, file = "./data/human_genes.rds")
human_genes <- readRDS(file = "./data/human_genes.rds")

We filter the previously loaded human_genes for only the circadian and driver genes, then display those circadian genes in an interactive datatable.

human_genes %>%
  dplyr::filter(gene_name %in% circadian_gene_list) %>%
  dplyr::arrange(gene_name) %>%
  as.data.frame() -> circadian_genes

# human_genes %>%
#   dplyr::filter(gene_name %in% driver_gene_list) %>%
#   dplyr::arrange(gene_name) %>%
#   as.data.frame() -> driver_genes

DT::datatable(circadian_genes)

Depmap

We create an ExperimentHub object to pull relevant DepMap datasets:

crispr: CRISPR-Cas9 essentiality (dependency) scores
copyNumber: copy number alterations per gene/cell line
TPM: transcript expression levels (RNA-seq)
mutationCalls: mutation data
metadata: cell-line metadata (including disease subtype).
## create ExperimentHub query object
eh <- ExperimentHub()
query(eh, "depmap")
#> ExperimentHub with 82 records
#> # snapshotDate(): 2024-10-24
#> # $dataprovider: Broad Institute
#> # $species: Homo sapiens
#> # $rdataclass: tibble
#> # additional mcols(): taxonomyid, genome, description,
#> #   coordinate_1_based, maintainer, rdatadateadded, preparerclass, tags,
#> #   rdatapath, sourceurl, sourcetype 
#> # retrieve records with, e.g., 'object[["EH2260"]]' 
#> 
#>            title             
#>   EH2260 | rnai_19Q1         
#>   EH2261 | crispr_19Q1       
#>   EH2262 | copyNumber_19Q1   
#>   EH2263 | RPPA_19Q1         
#>   EH2264 | TPM_19Q1          
#>   ...      ...               
#>   EH7555 | copyNumber_22Q2   
#>   EH7556 | TPM_22Q2          
#>   EH7557 | mutationCalls_22Q2
#>   EH7558 | metadata_22Q2     
#>   EH7559 | achilles_22Q2
metadata <- eh[["EH7558"]]
crispr <- eh[["EH7554"]]
copyNumber <- eh[["EH7555"]]
TPM <- eh[["EH7556"]]
mutationCalls <- eh[["EH7557"]]

Metadata

We subset DepMap metadata for melanoma lines using keywords (“melanoma”) across relevant columns. We display the resulting set of melanoma cell lines in a datatable for easy scanning.

metadata %>%
  dplyr::filter(grepl("melanoma", lineage_subtype) |
                grepl("elanoma", Cellosaurus_NCIt_disease) |
                grepl("elanoma", subtype_disease) | 
                grepl("elanoma", cell_line)) %>%
  dplyr::select(-contains("issues")) %>%
  as.data.frame() -> melanoma_metadata

DT::datatable(melanoma_metadata)

CRISPR Dependency

We visualize how dependent these cell lines are on each circadian gene using [plotly](https://plotly.com/r/. The dashed lines represent:

Red line: the mean dependency across all circadian genes in just melanoma lines.
Green line: the mean dependency across all lines and all genes in DepMap (global average).
crispr %>%
  dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
                gene_name %in% circadian_genes$gene_name,
                !is.na(cell_line)) %>%
  as.data.frame() -> crispr_df

crispr %>% 
  dplyr::filter(!is.na(dependency)) %>%
  as.data.frame() -> crispr_global_dependency
  
crispr_df %>%
  dplyr::filter(!is.na(dependency)) %>%
  dplyr::group_by(gene_name) %>%
  dplyr::summarize(mean_dependency = mean(dependency, na.remove = TRUE)) %>% 
  as.data.frame() -> crispr_gene_mean_dep

crispr_df %>%
  dplyr::left_join(crispr_gene_mean_dep, by = "gene_name") %>%
  dplyr::arrange(desc(mean_dependency)) %>%
  dplyr::mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  dplyr::select(-c(gene, cell_line)) %>% 
  dplyr::left_join(metadata, by = "depmap_id") %>%
  as.data.frame() -> crispr_gene_mean_dep_merged

crispr_gene_mean_dep_merged %>%
  ggplot(aes(x = gene_name, y = dependency, color = subtype_disease)) +
  geom_point(size = 0.75) +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  geom_hline(yintercept = mean(crispr_gene_mean_dep$mean_dependency, na.remove = TRUE),
             color = "darkred", linetype = "dashed") +
  geom_hline(yintercept = mean(crispr_global_dependency$dependency, na.remove = TRUE),
             color = "darkgreen", linetype = "dashed") +
  xlab("circadian rhythm-related genes") +
  ggtitle("Circadian rhythm-related genes ranked by descending mean dependency score") -> p1
ggplotly(p1)

We reshape (pivot_wider) the data so that rows = genes, columns = melanoma cell lines, and values = CRISPR dependency scores. Then we call pheatmap to visualize how essential each gene is across multiple lines in a grid layout.

Interpretation: we don’t observe a strong pattern in dependency scores in circadian genes in melanoma cell lines

crispr_df %>%
  dplyr::select(gene_name, cell_line, dependency) %>%
  tidyr::pivot_wider(names_from = cell_line,
                     values_from = dependency) %>%
  tibble::column_to_rownames(var = "gene_name") %>%
  t() %>% as.data.frame() %>%
  tibble::rownames_to_column(var = "cell_line") %>%
  dplyr::left_join(melanoma_metadata, by = "cell_line") %>%
  as.data.frame() -> crispr_ann_df

data.frame(status = crispr_ann_df$primary_or_metastasis,
           subtype = crispr_ann_df$subtype_disease,
           site = crispr_ann_df$sample_collection_site,
           row.names = crispr_ann_df$cell_line) -> sample_col

crispr_ann_df %>%
  dplyr::select(cell_line:RORB) %>%
  tibble::column_to_rownames(var = "cell_line") %>%
  t() %>%
  pheatmap::pheatmap(
    annotation_col = sample_col,
    show_colnames = FALSE,
    border_color = NA,
    fontsize = 7,
    main = paste0("Heatmap of dependency scores for circadian rhythm-related ",
                  "genes in melanoma cell lines"))

Copy Number

We check log2 copy-number values per gene. A dashed line at y=1 (or y=0, depending on your scale) might indicate a diploid reference. Higher or lower values suggest possible amplification or deletion for these circadian genes in melanoma cells. The genes are arranged by descending average copy number. For more information how Depmap CNV is calculated, please refer to the DepMap documentation.

Interpretation: RORC average copy number is highest in melanoma, but also displays the greatest variation. The other circadian-related genes display variation, but their mean CNV is close to 1

copyNumber %>%
  dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
                gene_name %in% circadian_genes$gene_name,
                !is.na(cell_line)) %>%
  as.data.frame() -> copy_number_df

copy_number_df %>%
  dplyr::arrange(desc(log_copy_number)) %>% 
  dplyr::mutate(gene_name = factor(gene_name, levels = unique(gene_name))) %>%
  ggplot(aes(x = gene_name, y = log_copy_number, fill = gene_name)) +
    geom_violin() +
    geom_boxplot(width = 0.25) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1),
          legend.position = "none") +
    geom_hline(yintercept = 1, color = "black", linetype = "dashed") +
    ggtitle(paste0("Log10 copy-number for circadian rhythm-related genes in",
                   " melanoma cell lines"))

Gene Expression

We pivot from long to wide, building a matrix of TPM expression with genes as rows and cell lines as columns. Then we optionally annotate columns with metadata (metastatic status, disease subtype) and generate a pheatmap to see expression clustering.

Interpretation: Expression of PER2, RORA, RORB and RORC appears to be very low in all melanome cell lines. NPAS expression is the most elevated, while the remaining genes display expression somewhere in between.

TPM %>%
  dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
                gene_name %in% circadian_genes$gene_name,
                !is.na(cell_line)) %>%
  as.data.frame() -> tpm_df

tpm_df %>%
  dplyr::select(gene_name, cell_line, rna_expression) %>%
  tidyr::pivot_wider(names_from = cell_line,
                     values_from = rna_expression) %>%
  tibble::column_to_rownames(var = "gene_name") %>%
  t() %>% as.data.frame() %>%
  tibble::rownames_to_column(var = "cell_line") %>%
  dplyr::left_join(melanoma_metadata, by = "cell_line") %>%
  as.data.frame() -> tpm_ann_df

data.frame(status = tpm_ann_df$primary_or_metastasis,
           subtype = tpm_ann_df$subtype_disease,
           site = tpm_ann_df$sample_collection_site,
           row.names = tpm_ann_df$cell_line) -> sample_col

tpm_ann_df %>%
  dplyr::select(cell_line:RORB) %>%
  tibble::column_to_rownames(var = "cell_line") %>%
  t() %>%
  pheatmap::pheatmap(
    annotation_col = sample_col,
    fontsize = 7,
    border_color = NA,
    show_colnames = FALSE,
    main = paste0("Heatmap of log10 gene expression for circadian rhythm-related ",
                  "genes in melanoma cell lines"))

We join CRISPR dependency data with expression (TPM) for the same gene/cell line pairs, then plot expression vs. dependency. The stat_cor(method = "spearman") call adds a correlation value to see if higher expression correlates with lower (or higher) essentiality.

Interpretation: We don’t observe a signfificant direct correlation between any circadian rhythm gene expression and CRISPR dependency scores.

Note: Normally, we would expect a relationship where increased gene expression displays increased dependency, if such a gene dependency exists.

tpm_df %>%
  dplyr::select(-c(gene, cell_line, entrez_id)) %>%
  dplyr::left_join(crispr_df, by = c("depmap_id", "gene_name")) %>%
  dplyr::filter(!is.na(cell_line)) %>%
  ggplot(aes(x = dependency, y = rna_expression, colour = gene_name)) +
    geom_point() +
    stat_smooth(method = "lm", se = TRUE, formula = y ~ poly(x, 1, raw = TRUE),
                color = "black", linetype = "dashed", fill = "gray") +
    ggpubr::stat_cor(method = "spearman", label.x = -0.5, label.y = 6) +
    ggtitle(paste0("Correlation between circadian-rhythm gene expression and ",
                   "CRISPR dependency\nscores in Depmap melanoma cell lines")) +
    facet_wrap(~gene_name, ncol = 3)

Mutation Calls

We look at mutationCalls table, combine with metadata, and keep only those entries with a circadian gene in melanoma cell lines.

mutationCalls %>%
  dplyr::left_join(melanoma_metadata, by = "depmap_id") %>%
  dplyr::filter(cell_line %in% melanoma_metadata$cell_line,
                gene_name %in% circadian_genes$gene_name,
                !is.na(cell_line)) %>%
  as.data.frame() -> mutation_calls_df

We create a balloon plot to see how mutation classification (e.g., “Missense”, “Nonsense”, “Splice site”) differs between primary vs. metastatic cell lines.

table(mutation_calls_df$var_class, mutation_calls_df$primary_or_metastasis) %>%
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap elanoma circadian gene mutation type by metastatic status")

We create a balloon plot to see how mutation classification (e.g., “Missense”, “Nonsense”, “Splice site”) differs between primary vs. metastatic cell lines.

table(mutation_calls_df$var_class, mutation_calls_df$gene_name) %>%
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap melanoma circadian gene mutation type by gene")

A balloon plot that partitions each bar by the circadian gene mutated, so you can see which genes tend to have which var_class.

table(mutation_calls_df$var_annotation, mutation_calls_df$primary_or_metastasis) %>%
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap melanoma circadian gene mutation annotation by metastatic status")

We plot the variant annotation to see how it splits between primary vs metastatic lines.

table(mutation_calls_df$var_annotation, mutation_calls_df$gene_name) %>%
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap melanoma circadian gene mutation annotations by gene")

We create a custom column mutation_type that concatenates ref_allele and alt_allele (e.g., C_to_T) to highlight known mutational signatures. The bar chart categorizes them by primary vs. metastatic. This often reveals a high frequency of UV-associated C>T transitions in melanoma.

UV Exposure: In primary cutaneous melanomas, C to T (and symmetrically G to A) transitions at dipyrimidine sites are the prototypical “UV signature.” Given the strong etiological of melanoma link with UV radiation, it is no surprise these transitions dominate.

mutation_calls_df %>%
  dplyr::mutate(mutation_type = paste0(ref_allele, "_to_", alt_allele)) %>%
  as.data.frame() -> mutation_calls_df2

table(mutation_calls_df2$mutation_type, mutation_calls_df2$primary_or_metastasis) %>% 
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap melanoma circadian gene mutation transitions by metastatic status")

mutation_calls_df %>%
  dplyr::mutate(mutation_type = paste0(ref_allele, "_to_", alt_allele)) %>%
  as.data.frame() -> mutation_calls_df2

table(mutation_calls_df2$mutation_type, mutation_calls_df2$gene_name) %>% 
  as.data.frame() %>% 
  ggballoonplot(fill = "value") +
    scale_fill_gradientn(colors = my_cols) +
    ggtitle("Depmap melanoma circadian gene mutation transitions by gene")

Summary

  1. Data Filtering: We demonstrated how to access and filter DepMap datasets to melanoma lines and circadian genes.
  2. Visualizations: We demonstrated how to visualize Depmap data using bar plots, heatmaps, scatter plots, and interactive plots via plotly show how circadian gene dependency, copy number, expression, and mutations vary across these cell lines.
  3. Future Directions:

This exploration underscores the utility of DepMap data and R-based visualization workflows to generate hypotheses about circadian gene roles in melanoma cell-line survival, copy number changes, and potentially UV-signature–associated mutations.

sessionInfo()
#> R version 4.4.2 (2024-10-31)
#> Platform: x86_64-pc-linux-gnu
#> Running under: Ubuntu 24.04.2 LTS
#> 
#> Matrix products: default
#> BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.12.0 
#> LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0
#> 
#> locale:
#>  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
#>  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
#>  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
#>  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
#>  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
#> [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
#> 
#> time zone: Europe/Brussels
#> tzcode source: system (glibc)
#> 
#> attached base packages:
#> [1] stats     graphics  grDevices utils     datasets  methods   base     
#> 
#> other attached packages:
#>  [1] DT_0.33              pheatmap_1.0.12      plotly_4.10.4       
#>  [4] biomaRt_2.62.1       ExperimentHub_2.14.0 AnnotationHub_3.14.0
#>  [7] BiocFileCache_2.14.0 dbplyr_2.5.0         BiocGenerics_0.52.0 
#> [10] depmap_1.20.0        ggpubr_0.6.0         ggrepel_0.9.6       
#> [13] stringr_1.5.1        viridis_0.6.5        viridisLite_0.4.2   
#> [16] ggplot2_3.5.1        tibble_3.2.1         tidyr_1.3.1         
#> [19] dplyr_1.1.4         
#> 
#> loaded via a namespace (and not attached):
#>  [1] DBI_1.2.3               gridExtra_2.3           httr2_1.1.0            
#>  [4] rlang_1.1.5             magrittr_2.0.3          compiler_4.4.2         
#>  [7] RSQLite_2.3.9           mgcv_1.9-1              png_0.1-8              
#> [10] vctrs_0.6.5             pkgconfig_2.0.3         crayon_1.5.3           
#> [13] fastmap_1.2.0           backports_1.5.0         XVector_0.46.0         
#> [16] labeling_0.4.3          rmarkdown_2.29          UCSC.utils_1.2.0       
#> [19] purrr_1.0.4             bit_4.5.0.1             xfun_0.50              
#> [22] zlibbioc_1.52.0         cachem_1.1.0            GenomeInfoDb_1.42.3    
#> [25] jsonlite_1.8.9          progress_1.2.3          blob_1.2.4             
#> [28] broom_1.0.7             prettyunits_1.2.0       R6_2.6.1               
#> [31] bslib_0.9.0             stringi_1.8.4           RColorBrewer_1.1-3     
#> [34] car_3.1-3               jquerylib_0.1.4         Rcpp_1.0.14            
#> [37] knitr_1.49              IRanges_2.40.1          Matrix_1.7-2           
#> [40] splines_4.4.2           tidyselect_1.2.1        rstudioapi_0.17.1      
#> [43] abind_1.4-8             yaml_2.3.10             curl_6.2.0             
#> [46] lattice_0.22-6          Biobase_2.66.0          withr_3.0.2            
#> [49] KEGGREST_1.46.0         evaluate_1.0.3          xml2_1.3.6             
#> [52] Biostrings_2.74.1       pillar_1.10.1           BiocManager_1.30.25    
#> [55] filelock_1.0.3          carData_3.0-5           stats4_4.4.2           
#> [58] generics_0.1.3          BiocVersion_3.20.0      S4Vectors_0.44.0       
#> [61] hms_1.1.3               munsell_0.5.1           scales_1.3.0           
#> [64] glue_1.8.0              lazyeval_0.2.2          tools_4.4.2            
#> [67] data.table_1.16.4       ggsignif_0.6.4          grid_4.4.2             
#> [70] crosstalk_1.2.1         AnnotationDbi_1.68.0    colorspace_2.1-1       
#> [73] nlme_3.1-167            GenomeInfoDbData_1.2.13 Formula_1.2-5          
#> [76] cli_3.6.4               rappdirs_0.3.3          gtable_0.3.6           
#> [79] rstatix_0.7.2           sass_0.4.9              digest_0.6.37          
#> [82] farver_2.1.2            htmlwidgets_1.6.4       memoise_2.0.1          
#> [85] htmltools_0.5.8.1       lifecycle_1.0.4         httr_1.4.7             
#> [88] mime_0.12               bit64_4.6.0-1